#loading packages
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.5 ✓ stringr 1.4.0
## ✓ tidyr 1.1.2 ✓ forcats 0.5.0
## ✓ readr 1.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x lubridate::as.difftime() masks base::as.difftime()
## x lubridate::date() masks base::date()
## x dplyr::filter() masks stats::filter()
## x lubridate::intersect() masks base::intersect()
## x dplyr::lag() masks stats::lag()
## x lubridate::setdiff() masks base::setdiff()
## x lubridate::union() masks base::union()
library(ggridges) # for joy plots
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(gganimate) # for adding animation layers to ggplots
library(gifski) # for creating the gif (don't need to load this library every time,but need it installed)
# Load the Spotify songs dataset (TidyTuesday, 2020-01-21) directly from GitHub.
spotify <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv"
)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_double(),
## track_id = col_character(),
## track_name = col_character(),
## track_artist = col_character(),
## track_album_id = col_character(),
## track_album_name = col_character(),
## track_album_release_date = col_character(),
## playlist_name = col_character(),
## playlist_id = col_character(),
## playlist_genre = col_character(),
## playlist_subgenre = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
# Songs that come from playlists tagged with the "rap" genre.
spotify_rap <- filter(spotify, playlist_genre == "rap")

# R&B playlist songs with a popularity score of at least 75; the three ID
# columns are dropped because they are not used downstream.
randb <- spotify %>%
  filter(playlist_genre == "r&b", track_popularity >= 75) %>%
  select(-c(track_id, track_album_id, playlist_id))
Why did we analyze Spotify data? Why is the data significant, and why should people care? Introduce the data to the audience.
# Boxplot of track popularity for each playlist genre, with a blue vertical
# reference line at the median popularity of all songs in `spotify`.
prelim_graph <- spotify %>%
  ggplot(aes(y = playlist_genre, x = track_popularity)) +
  labs(
    title = "Song Popularity by Genre",
    x = "", y = "",
    subtitle = "Song popularity is measured from 0-100, with higher numbers being indicative of more popularity.\nHighest median popularities belong to pop and latin with an overall median popularity of 40",
    caption = "Alex Ismail, Malek Kaloti, Brian Lee"
  ) +
  theme_classic() +
  theme(
    plot.title.position = "plot",
    plot.title = element_text(size = 20, face = "bold"),
    plot.subtitle = element_text(size = 10, face = "italic")
  ) +
  geom_boxplot() +
  # The overall median is a single constant, so it is passed as a fixed
  # parameter. Mapping it inside aes() recycles the value to every row of the
  # data and draws one identical line per song.
  geom_vline(
    xintercept = median(spotify$track_popularity, na.rm = TRUE),
    color = "blue"
  )
prelim_graph
# Names of the audio-feature columns, taken by position (columns 12 to 23).
# NOTE(review): positional selection silently changes if the column order of
# `spotify` changes -- confirm these are still the intended columns.
feature_names <- names(spotify)[12:23]

# Density curve of every audio feature, colored by playlist genre, one facet
# per feature with free scales because the features use different ranges.
density_plot <- spotify %>%
  # all_of() marks `feature_names` as an external character vector; passing the
  # bare name is ambiguous with a column of the same name and triggers a
  # tidyselect message.
  select(playlist_genre, all_of(feature_names)) %>%
  pivot_longer(cols = all_of(feature_names)) %>%
  ggplot(aes(x = value)) +
  geom_density(aes(color = playlist_genre), alpha = 0.5) +
  facet_wrap(~name, ncol = 3, scales = "free") +
  labs(
    title = "Spotify Audio Feature Density - by Genre",
    x = "", y = "density"
  ) +
  theme(axis.text.y = element_blank())
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(feature_names)` instead of `feature_names` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
# Render the faceted density plot as an interactive plotly widget.
density_plot %>%
  ggplotly()
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
# Ridgeline plot of popularity scores per playlist genre, restricted to songs
# with a popularity of at least 75.
ggplot(
  filter(spotify, track_popularity >= 75),
  aes(x = track_popularity, y = playlist_genre)
) +
  geom_density_ridges() +
  labs(x = "Popularity", y = "Playlist Genre") +
  theme_ridges()
## Picking joint bandwidth of 1.37
#get rid of axes, add a more descriptive subtitle
Data retrieved from the TidyTuesday GitHub repository (https://github.com/rfordatascience/tidytuesday/tree/master/data/2020/2020-01-21).
# Share of rap songs that are "popular" (track_popularity strictly above 75)
# at each value of four audio characteristics, each rounded to one decimal.
spotify_rap %>%
  mutate(
    Rounded_Danceability = round(danceability, digits = 1),
    Rounded_Energy = round(energy, digits = 1),
    Rounded_Speechiness = round(speechiness, digits = 1),
    Rounded_Instrumental = round(instrumentalness, digits = 1),
    popular = track_popularity > 75
  ) %>%
  # One row per song and characteristic, so all four can share one plot.
  pivot_longer(
    cols = starts_with("Rounded"),
    names_to = "Stat1",
    values_to = "Rounded_Value"
  ) %>%
  group_by(Stat1, Rounded_Value) %>%
  # Drop the grouping here: if the result stays grouped by Stat1, fct_recode()
  # below runs once per group, sees a single level each time, and warns about
  # "Unknown levels" for the other three.
  summarize(Pop_Rate = mean(popular) * 100, .groups = "drop") %>%
  mutate(
    Stat = fct_recode(
      Stat1,
      Danceability = "Rounded_Danceability",
      Energy = "Rounded_Energy",
      Speechiness = "Rounded_Speechiness",
      Instrumental = "Rounded_Instrumental"
    )
  ) %>%
  ggplot(aes(x = Rounded_Value, y = Pop_Rate)) +
  geom_line(aes(color = Stat)) +
  labs(
    title = "Popularity of Rap Songs by Song Characteristic",
    x = "", y = "Percent Popular", color = "Song Statistic"
  ) +
  theme_classic() +
  theme(
    plot.title.position = "plot",
    plot.title = element_text(size = 20, face = "bold"),
    plot.subtitle = element_text(size = 10, face = "italic")
  )
## `summarise()` regrouping output by 'Stat1' (override with `.groups` argument)
## Warning: Problem with `mutate()` input `Stat`.
## ℹ Unknown levels in `f`: Rounded_Energy, Rounded_Speechiness, Rounded_Instrumental
## ℹ Input `Stat` is `fct_recode(...)`.
## ℹ The error occurred in group 1: Stat1 = "Rounded_Danceability".
## Warning: Unknown levels in `f`: Rounded_Energy, Rounded_Speechiness,
## Rounded_Instrumental
## Warning: Problem with `mutate()` input `Stat`.
## ℹ Unknown levels in `f`: Rounded_Danceability, Rounded_Speechiness, Rounded_Instrumental
## ℹ Input `Stat` is `fct_recode(...)`.
## ℹ The error occurred in group 2: Stat1 = "Rounded_Energy".
## Warning: Unknown levels in `f`: Rounded_Danceability, Rounded_Speechiness,
## Rounded_Instrumental
## Warning: Problem with `mutate()` input `Stat`.
## ℹ Unknown levels in `f`: Rounded_Danceability, Rounded_Energy, Rounded_Speechiness
## ℹ Input `Stat` is `fct_recode(...)`.
## ℹ The error occurred in group 3: Stat1 = "Rounded_Instrumental".
## Warning: Unknown levels in `f`: Rounded_Danceability, Rounded_Energy,
## Rounded_Speechiness
## Warning: Problem with `mutate()` input `Stat`.
## ℹ Unknown levels in `f`: Rounded_Danceability, Rounded_Energy, Rounded_Instrumental
## ℹ Input `Stat` is `fct_recode(...)`.
## ℹ The error occurred in group 4: Stat1 = "Rounded_Speechiness".
## Warning: Unknown levels in `f`: Rounded_Danceability, Rounded_Energy,
## Rounded_Instrumental
# Percent of songs that are "popular" (track_popularity strictly above 75),
# split by genre and by whether the track name suggests multiple artists
# (the name contains "remix" or "feat").
spotify %>%
  mutate(
    track_name_lower = str_to_lower(track_name),
    # The name is lower-cased first, so the patterns must be lower case too;
    # a capitalized "Remix" pattern can never match.
    remix = str_detect(track_name_lower, "remix"),
    feature = str_detect(track_name_lower, "feat"),
    # Tracks with a missing name yield NA above; treat them as one artist.
    multiple_artists = if_else(
      replace_na(remix | feature, FALSE),
      true = "Multiple Artists",
      false = "One Artist"
    ),
    popular = track_popularity > 75
  ) %>%
  group_by(multiple_artists, playlist_genre) %>%
  # Drop the grouping so the factor below is built once over all rows.
  summarize(prop_pop = mean(popular) * 100, .groups = "drop") %>%
  # Put rap first so it is the first facet.
  mutate(genre = fct_relevel(playlist_genre, "rap")) %>%
  ggplot() +
  geom_col(aes(x = multiple_artists, y = prop_pop)) +
  facet_wrap(~genre) +
  labs(
    title = "Popularity of Songs Containing Multiple Artists Across Genre",
    x = "", y = "Percent of Songs Popular"
  ) +
  theme_classic() +
  theme(
    plot.title.position = "plot",
    plot.title = element_text(size = 20, face = "bold"),
    plot.subtitle = element_text(size = 10, face = "italic")
  )
## `summarise()` regrouping output by 'multiple_artists' (override with `.groups` argument)
# Preview the first rows of the filtered R&B data.
randb %>%
  head()
## # A tibble: 6 x 20
## track_name track_artist track_popularity track_album_name track_album_rel…
## <chr> <chr> <dbl> <chr> <chr>
## 1 Life Is G… Future 93 Life Is Good (f… 2020-01-10
## 2 Ayy Macar… Tyga 91 Ayy Macarena 2019-11-13
## 3 HIGHEST I… Travis Scott 89 JACKBOYS 2019-12-27
## 4 FML Arizona Zer… 82 Living Facts 2018-06-03
## 5 OUT WEST … JACKBOYS 87 JACKBOYS 2019-12-27
## 6 Out Of Yo… French Mont… 75 MONTANA 2019-12-06
## # … with 15 more variables: playlist_name <chr>, playlist_genre <chr>,
## # playlist_subgenre <chr>, danceability <dbl>, energy <dbl>, key <dbl>,
## # loudness <dbl>, mode <dbl>, speechiness <dbl>, acousticness <dbl>,
## # instrumentalness <dbl>, liveness <dbl>, valence <dbl>, tempo <dbl>,
## # duration_ms <dbl>
# Animated density of track popularity that steps through the R&B subgenres
# one state at a time; the subtitle shows the subgenre currently displayed.
ggplot(
  randb,
  aes(
    x = track_popularity,
    fill = playlist_subgenre,
    color = playlist_subgenre
  )
) +
  geom_density(alpha = 0.1) +
  labs(
    title = "ADD TITLE",
    subtitle = "R&B Subgenre: {closest_state}"
  ) +
  transition_states(
    playlist_subgenre,
    transition_length = 3,
    state_length = 1
  )
#get rid of axes, make subtitle descriptive

# No animation is passed, so anim_save() writes the most recently rendered
# one; the saved gif is then embedded in the document.
anim_save(filename = "randb_density.gif")
knitr::include_graphics(path = "randb_density.gif")
Why do hip pop and urban contemporary have such similar density curves? In this section, I look at the audio features of these two subgenres specifically.
# Mean popularity and audio features for the "hip pop" and "urban
# contemporary" subgenres, rendered as a table.
randb %>%
  # %in% tests membership row by row. Comparing with `==` against a length-2
  # vector recycles it down the column, so each row is checked against only
  # one of the two names and matching rows are silently dropped.
  filter(playlist_subgenre %in% c("hip pop", "urban contemporary")) %>%
  group_by(playlist_subgenre) %>%
  summarise(
    across(
      all_of(c(
        "track_popularity", "danceability", "energy", "key", "loudness",
        "mode", "speechiness", "instrumentalness", "liveness", "valence",
        "tempo", "duration_ms"
      )),
      ~ mean(.x, na.rm = TRUE)
    ),
    .groups = "drop"
  ) %>%
  knitr::kable()
## Warning in playlist_subgenre == c("hip pop", "urban contemporary"): longer
## object length is not a multiple of shorter object length
## Warning in playlist_subgenre == c("hip pop", "urban contemporary"): longer
## object length is not a multiple of shorter object length
| playlist_subgenre | track_popularity | danceability | energy | key | loudness | mode | speechiness | instrumentalness | liveness | valence | tempo | duration_ms |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| hip pop | 82.62411 | 0.6985887 | 0.6000979 | 5.014184 | -6.380170 | 0.6808511 | 0.1304929 | 0.0120022 | 0.1580014 | 0.4780922 | 116.8704 | 200865.0 |
| urban contemporary | 81.98039 | 0.6823333 | 0.5401578 | 5.696078 | -7.651382 | 0.4803922 | 0.1340971 | 0.0135849 | 0.1504039 | 0.4606735 | 121.0225 | 207035.3 |
# maybe somehow graph this??